In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
%matplotlib inline
def rmse(y_true, y_pred):
return np.sqrt(mean_squared_error(y_true, y_pred))
In [2]:
df_train = pd.read_csv('train.csv')
df_train.head()
Out[2]:
In [3]:
df_null = df_train.isnull().sum()
#print (len(df_null))
df1 = pd.DataFrame()
for idx_num in range(len(df_null)):
if df_null.values[idx_num] != 0:
df_temp = pd.DataFrame(index = [df_null.index[idx_num]],data = [df_null.values[idx_num]])
df1 = df1.append(df_temp)
print (df1)
In [4]:
# handle the missing data
# Alley : NA means no alley, so change it to 'None'
df_train.loc[:,'Alley'] = df_train.loc[:,'Alley'].fillna('None')
# FireplaceQu : NA means no Fireplace
df_train.loc[:,'FireplaceQu'] = df_train.loc[:,'FireplaceQu'].fillna('None')
# PoolQC : NA means no pool
df_train.loc[:,'PoolQC'] = df_train.loc[:,'PoolQC'].fillna('None')
# Fence : NA means no fence
df_train.loc[:,'Fence'] = df_train.loc[:,'Fence'].fillna('None')
# MiscFeature : NA means no feature, and its MiscVal should be 0
df_train.loc[:,'MiscFeature'] = df_train.loc[:,'MiscFeature'].fillna('None')
## LotFrontage : make it 0 first
df_train.loc[:,'LotFrontage'] = df_train.loc[:,'LotFrontage'].fillna(0)
## MasVnrType : NaN most likely to be None, make it None first
df_train.loc[:,'MasVnrType'] = df_train.loc[:,'MasVnrType'].fillna('None')
## MasVnrArea : NaN is the same with MasVnrType, make it 0 first
df_train.loc[:,'MasVnrArea'] = df_train.loc[:,'MasVnrArea'].fillna(0)
# BsmtQual : NA means no basement
df_train.loc[:,'BsmtQual'] = df_train.loc[:,'BsmtQual'].fillna('None')
# BsmtCond : NA means no basement
df_train.loc[:,'BsmtCond'] = df_train.loc[:,'BsmtCond'].fillna('None')
# BsmtExposure : NA means no basement, but index 948 is strange, based on other data with unfinished basement, the Nan is mostly 'No'
df_train.loc[948,'BsmtExposure'] = 'No'
df_train.loc[:,'BsmtExposure'] = df_train.loc[:,'BsmtExposure'].fillna('None')
# BsmtFinType1 : NA means no basement
df_train.loc[:,'BsmtFinType1'] = df_train.loc[:,'BsmtFinType1'].fillna('None')
# BsmtFinType2 : NA means no basement
df_train.loc[:,'BsmtFinType2'] = df_train.loc[:,'BsmtFinType2'].fillna('None')
# Electrical : only one NA, and hard to interpret, just drop the data
df_train = df_train[df_train['Electrical'].notnull()]
# GarageType, GarageYrBlt,GarageFinish,GarageQual,GarageCond : NA means no garage
df_train.loc[:,'GarageType'] = df_train.loc[:,'GarageType'].fillna('None')
df_train.loc[:,'GarageFinish'] = df_train.loc[:,'GarageFinish'].fillna('None')
df_train.loc[:,'GarageQual'] = df_train.loc[:,'GarageQual'].fillna('None')
df_train.loc[:,'GarageCond'] = df_train.loc[:,'GarageCond'].fillna('None')
## Tansfer GarageYrBlt later, fill in 0 first, then deal with it
df_train.loc[:,'GarageYrBlt'] = df_train.loc[:,'GarageYrBlt'].fillna(0)
In [5]:
# handle categorical like feature value
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
def apply_label_encoder(data,feature_name):
le.fit(df_train[feature_name])
data[feature_name] = le.transform(data[feature_name])
labelEncoder_list = ['MSSubClass','MSZoning','Street','Alley','LandContour','LotConfig','Neighborhood','Condition1',
'Condition2','BldgType','HouseStyle','YearBuilt','YearRemodAdd','RoofStyle','RoofMatl','Exterior1st','Exterior2nd','MasVnrType','MasVnrArea',
'Foundation','BsmtFinSF1','Heating','CentralAir','Electrical','GarageYrBlt','Fence','MiscFeature','MoSold','YrSold','SaleType','SaleCondition']
for feature in labelEncoder_list:
apply_label_encoder(df_train,feature)
def apply_qualify_encoder(data,feature_name):
return data.replace({feature_name:{"None" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5}})
qual_labelEncoder_list = ['ExterCond','ExterQual','HeatingQC','KitchenQual','BsmtQual','BsmtCond','FireplaceQu','GarageCond','GarageQual','PoolQC']
for feature in qual_labelEncoder_list:
df_train = apply_qualify_encoder(df_train,feature)
#special ones
df_train = df_train.replace({"BsmtExposure" : {"None" : 0,"No" : 0, "Mn" : 1, "Av": 2, "Gd" : 3},
"BsmtFinType1" : {"None" : 0, "Unf" : 1, "LwQ": 2, "Rec" : 3, "BLQ" : 4, "ALQ" : 5, "GLQ" : 6},
"BsmtFinType2" : {"None" : 0, "Unf" : 1, "LwQ": 2, "Rec" : 3, "BLQ" : 4, "ALQ" : 5, "GLQ" : 6},
"Functional" : {"Sal" : 1, "Sev" : 2, "Maj2" : 3, "Maj1" : 4, "Mod": 5,
"Min2" : 6, "Min1" : 7, "Typ" : 8},
"LandSlope" : {"Sev" : 1, "Mod" : 2, "Gtl" : 3},
"LotShape" : {"IR3" : 1, "IR2" : 2, "IR1" : 3, "Reg" : 4},
"PavedDrive" : {"N" : 0, "P" : 1, "Y" : 2},
"Utilities" : {"ELO" : 1, "NoSeWa" : 2, "NoSewr" : 3, "AllPub" : 4},
"GarageFinish": {"None" : 0, "Unf":1, "RFn":2,"Fin":3},
"GarageType" : {"None":0, "Detchd":1, "CarPort":2, "BuiltIn":3, "Basment":4, "Attchd":5,"2Types":6}})
In [6]:
# drop 'Id', it is no relationship with SalePrice
df_train = df_train.drop('Id',axis=1)
In [7]:
corrmat = df_train.corr()
f,ax = plt.subplots(figsize=(20,17))
sns.heatmap(corrmat)
Out[7]:
In [8]:
corrmat.nlargest(len(df_train),'SalePrice')['SalePrice']
Out[8]:
In [9]:
def data_observation(X_feature,y_feature):
plt.scatter(df_train[X_feature], df_train[y_feature], c = "blue", marker = "s")
plt.title("Observation of X and SalePrice")
plt.xlabel(X_feature)
plt.ylabel(y_feature)
plt.show()
In [10]:
# for i in df_train.columns:
# if i != 'SalePrice':
# data_observation(i,'SalePrice')
In [11]:
# remove outliers
df_train[(df_train['LotFrontage']>250)]
df_train = df_train.drop([934,1298])
df_train[(df_train['MasVnrArea']<50)&(df_train['SalePrice']>700000)]
df_train = df_train.drop([1182])
df_train[(df_train['BsmtFinSF1']>50)&(df_train['SalePrice']>500000)].loc[:,'BsmtFinSF1']
df_train = df_train.drop([1046])
# df_train[(df_train['GrLivArea']>5000)]
# df_train = df_train.drop([1298])
df_train[(df_train['GarageYrBlt']>60)&(df_train['SalePrice']>620000)]
df_train = df_train.drop([691,1169])
df_train[(df_train['OpenPorchSF']>500)&(df_train['SalePrice']<100000)]
df_train = df_train.drop([495])
df_train[(df_train['YearBuilt']>80) & (df_train['SalePrice']>600000)]
df_train = df_train.drop([898])
df_train[(df_train['YearBuilt']<20) & (df_train['SalePrice']>400000)]
df_train = df_train.drop([185])
# df_train[(df_train['TotalBsmtSF']>5000)]
# df_train = df_train.drop([1298])
df_train[(df_train['1stFlrSF']>2700)]
df_train = df_train.drop([496,523,1024])
In [12]:
for i in df_train.columns:
if i != 'SalePrice':
data_observation(i,'SalePrice')
In [13]:
#Use this block to observe different features correlation
corrmat.nlargest(len(df_train),'Utilities')['SalePrice']
Out[13]:
In [14]:
def normalize(feature):
return (df_train[feature]-min(df_train[feature]))/max(df_train[feature])
group_1 = normalize('MSSubClass')+normalize('BldgType')+normalize('HouseStyle')
group_1 = group_1/3
group_2 = normalize('MSZoning')
group_3 = normalize('Street')+normalize('Alley')
group_3 = group_3/2
group_4 = normalize('LotShape')+normalize('LandContour')+normalize('LotConfig')+normalize('LandSlope')
group_4 = group_4/4
group_5 = normalize('Utilities')+normalize('SaleType')+normalize('SaleCondition')
group_5 = group_5/3
group_6 = normalize('Neighborhood')
group_7 = normalize('Condition1')+normalize('Condition2')+normalize('PavedDrive')
group_7 = group_7/3
group_8 = normalize('OverallQual')+normalize('OverallCond')
group_8 = group_8/2
group_9 = normalize('YearBuilt')+normalize('YearRemodAdd')+normalize('Functional')
group_9 = group_9/3
group_10 = normalize('RoofStyle')
group_11 = normalize('Exterior1st')+normalize('Exterior2nd')
group_11 = group_11/2
group_12 = normalize('MasVnrType')
group_13 = normalize('ExterQual')+normalize('ExterCond')
group_13 = group_13/2
group_14 = normalize('Foundation')
group_15 = normalize('BsmtQual')+normalize('BsmtCond')+normalize('BsmtExposure')+normalize('BsmtFinType1')+normalize('BsmtFinType2')
group_15 = group_15/5
group_16 = normalize('Heating')+normalize('HeatingQC')+normalize('CentralAir')+normalize('Electrical')
group_16 = group_16/4
group_17 = normalize('LowQualFinSF')
group_18 = normalize('BsmtFullBath')+normalize('BsmtHalfBath')+normalize('FullBath')+normalize('HalfBath')+normalize('BedroomAbvGr')
group_18 = group_18/5
group_19 = normalize('Fireplaces')+normalize('FireplaceQu')
group_19 = group_19/2
group_20 = normalize('GarageType')+normalize('GarageYrBlt')+normalize('GarageFinish')+normalize('GarageCars')+normalize('GarageQual')+normalize('GarageCond')
group_20 = group_20/6
group_21 = normalize('PoolQC')+normalize('Fence')+normalize('MiscFeature')
group_21 = group_21/3
group_22 = normalize('MoSold')+normalize('YrSold')+normalize('SaleType')+normalize('SaleCondition')
group_22 = group_22/4
In [15]:
#Create new X_data
Cat_train = pd.concat([group_1,group_2,group_3,group_4,group_5,group_6,group_7,group_8,group_9,group_10,group_11,group_12
,group_13,group_14,group_15,group_16,group_17,group_18,group_19,group_20,group_21,group_22],axis=1)
Cat_train.columns = ['group_1','group_2','group_3','group_4','group_5','group_6','group_7','group_8','group_9','group_10','group_11','group_12'
,'group_13','group_14','group_15','group_16','group_17','group_18','group_19','group_20','group_21','group_22']
In [16]:
# search for normality
from scipy.stats import norm
import scipy.stats as stats
def Normal_observe(feature):
sns.distplot(df_train[feature],fit=norm)
fig = plt.figure()
# res = stats.probplot(df_train[feature],plot=plt)
# for feature in df_train.columns:
# Normal_observe(feature)
In [17]:
# Log tranformation
df_train['SalePrice'] = np.log(df_train['SalePrice'])
df_train['GrLivArea'] = np.log(df_train['GrLivArea'])
Normal_observe('SalePrice')
Normal_observe('GrLivArea')
In [18]:
# simple test model
from sklearn import linear_model
from sklearn.model_selection import KFold
def simple_linear_test(train_data,n_fold,feature_list):
# use n-fold Cross validation
kf = KFold(n_fold).split(train_data)
reg = linear_model.LinearRegression()
part = 0
total = 0
y_train_data = train_data.loc[:,'SalePrice']
X_train_data = train_data.drop('SalePrice',axis=1).loc[:,feature_list]
for train_idx,test_idx in kf:
print("TRAIN:", len(train_idx), "TEST:", len(test_idx))
X_train, y_train = X_train_data.iloc[train_idx], y_train_data.iloc[train_idx]
X_test, y_test = X_train_data.iloc[test_idx], y_train_data.iloc[test_idx]
reg.fit(X_train,y_train)
pred_test = reg.predict(X_test)
part += 1
pred_score = rmse(pred_test,y_test)
total += pred_score
print ("%d part RMSE: %.7f" % (part,pred_score))
print ('Avg RMSE : ',total/n_fold)
In [19]:
# use for categorical data
import xgboost as xgb
xgb = xgb.XGBRegressor(
colsample_bytree=0.8,
gamma=0.0,
learning_rate=0.01,
max_depth=6,
min_child_weight=1.5,
n_estimators=10000,
reg_alpha=0.9,
reg_lambda=0.6,
subsample=0.8,
seed=42,
silent=False)
# create new training set by previous feature selection
# X_data = pd.concat([df_train['GrLivArea'],df_train['GarageArea'],df_train['TotalBsmtSF'],df_train['OverallQual'],df_train['FullBath'],df_train['Fireplaces'],df_train['YearBuilt'],df_train['YearRemodAdd']],axis=1)
# X_data = pd.concat([Cat_train,df_train['GrLivArea'],df_train['GarageArea'],df_train['MasVnrArea'],df_train['LotArea'],df_train['LotFrontage'],df_train['BsmtFinSF1'],df_train['BsmtFinSF2'],
# df_train['BsmtUnfSF'],df_train['TotalBsmtSF'],df_train['1stFlrSF'],df_train['2ndFlrSF']],axis=1)
X_data = Cat_train
y_data = df_train['SalePrice']
#split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_data,y_data,test_size=0.2,random_state=0)
xgb.fit(X_train,y_train)
y_pred = xgb.predict(X_test)
print("XGBoost score on training set: ", rmse(y_test, y_pred))
In [20]:
#input the data
df_test = pd.read_csv('test.csv')
In [21]:
df_test_null = df_test.isnull().sum()
df2 = pd.DataFrame()
for idx_num in range(len(df_test_null)):
if df_test_null.values[idx_num] != 0:
df_temp = pd.DataFrame(index = [df_test_null.index[idx_num]],data = [df_test_null.values[idx_num]])
df2 = df2.append(df_temp)
print (df2)
In [22]:
# Alley : NA means no alley, so change it to 'None'
df_test.loc[:,'Alley'] = df_test.loc[:,'Alley'].fillna('None')
# FireplaceQu : NA means no Fireplace
df_test.loc[:,'FireplaceQu'] = df_test.loc[:,'FireplaceQu'].fillna('None')
# PoolQC : NA means no pool
df_test.loc[:,'PoolQC'] = df_test.loc[:,'PoolQC'].fillna('None')
# Fence : NA means no fence
df_test.loc[:,'Fence'] = df_test.loc[:,'Fence'].fillna('None')
# MiscFeature : NA means no feature, and its MiscVal should be 0
df_test.loc[:,'MiscFeature'] = df_test.loc[:,'MiscFeature'].fillna('None')
## LotFrontage : make it 0 first
df_test.loc[:,'LotFrontage'] = df_test.loc[:,'LotFrontage'].fillna(0)
## MasVnrType : NaN most likely to be None, make it None first
df_test.loc[:,'MasVnrType'] = df_test.loc[:,'MasVnrType'].fillna('None')
## MasVnrArea : NaN is the same with MasVnrType, make it 0 first
df_test.loc[:,'MasVnrArea'] = df_test.loc[:,'MasVnrArea'].fillna(0)
# BsmtQual : NA means no basement
df_test.loc[:,'BsmtQual'] = df_test.loc[:,'BsmtQual'].fillna('None')
# BsmtCond : NA means no basement
df_test.loc[:,'BsmtCond'] = df_test.loc[:,'BsmtCond'].fillna('None')
# BsmtExposure : NA means no basement, but index 948 is strange, based on other data with unfinished basement, the Nan is mostly 'No'
df_test.loc[:,'BsmtExposure'] = df_test.loc[:,'BsmtExposure'].fillna('None')
# BsmtFinType1 : NA means no basement
df_test.loc[:,'BsmtFinType1'] = df_test.loc[:,'BsmtFinType1'].fillna('None')
# BsmtFinType2 : NA means no basement
df_test.loc[:,'BsmtFinType2'] = df_test.loc[:,'BsmtFinType2'].fillna('None')
# Electrical : only one NA, and hard to interpret, just drop the data
df_test = df_test[df_test['Electrical'].notnull()]
# GarageType, GarageYrBlt,GarageFinish,GarageQual,GarageCond : NA means no garage
df_test.loc[:,'GarageType'] = df_test.loc[:,'GarageType'].fillna('None')
df_test.loc[:,'GarageFinish'] = df_test.loc[:,'GarageFinish'].fillna('None')
df_test.loc[:,'GarageQual'] = df_test.loc[:,'GarageQual'].fillna('None')
df_test.loc[:,'GarageCond'] = df_test.loc[:,'GarageCond'].fillna('None')
## Tansfer GarageYrBlt later, fill in 0 first, then deal with it
df_test.loc[:,'GarageYrBlt'] = df_test.loc[:,'GarageYrBlt'].fillna(0)
## more special transfer for test data
df_test.loc[:,'MSZoning'] = df_test.loc[:,'MSZoning'].fillna('I')
df_test.loc[:,'Utilities'] = df_test.loc[:,'Utilities'].fillna('AllPub')
df_test.loc[:,'Exterior1st'] = df_test.loc[:,'Exterior1st'].fillna('Stone')
df_test.loc[:,'Exterior2nd'] = df_test.loc[:,'Exterior2nd'].fillna('Stucco')
df_test.loc[:,'BsmtFinSF1'] = df_test.loc[:,'BsmtFinSF1'].fillna(201)
df_test.loc[:,'BsmtFinSF2'] = df_test.loc[:,'BsmtFinSF2'].fillna(46)
df_test.loc[:,'BsmtUnfSF'] = df_test.loc[:,'BsmtUnfSF'].fillna(566.17)
df_test.loc[:,'BsmtFullBath'] = df_test.loc[:,'BsmtFullBath'].fillna(0)
df_test.loc[:,'BsmtHalfBath'] = df_test.loc[:,'BsmtHalfBath'].fillna(0)
df_test.loc[:,'KitchenQual'] = df_test.loc[:,'KitchenQual'].fillna('TA')
df_test.loc[:,'Functional'] = df_test.loc[:,'Functional'].fillna('Sal')
df_test.loc[:,'GarageCars'] = df_test.loc[:,'GarageCars'].fillna(2)
df_test.loc[:,'GarageArea'] = df_test.loc[:,'GarageYrBlt'].fillna(470.83)
df_test.loc[:,'SaleType'] = df_test.loc[:,'GarageYrBlt'].fillna('ConLD')
df_test.loc[:,'TotalBsmtSF'] = df_test.loc[:,'TotalBsmtSF'].fillna(1047.64)
In [23]:
# make sure not null points
df_test_null = df_test.isnull().sum()
df2 = pd.DataFrame()
for idx_num in range(len(df_test_null)):
if df_test_null.values[idx_num] != 0:
df_temp = pd.DataFrame(index = [df_test_null.index[idx_num]],data = [df_test_null.values[idx_num]])
df2 = df2.append(df_temp)
print (df2)
In [24]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
def apply_label_encoder(data,feature_name):
le.fit(df_test[feature_name])
data[feature_name] = le.transform(data[feature_name])
labelEncoder_list = ['MSSubClass','MSZoning','Street','Alley','LandContour','LotConfig','Neighborhood','Condition1',
'Condition2','BldgType','HouseStyle','YearBuilt','YearRemodAdd','RoofStyle','RoofMatl','Exterior1st','Exterior2nd','MasVnrType','MasVnrArea',
'Foundation','BsmtFinSF1','Heating','CentralAir','Electrical','GarageYrBlt','Fence','MiscFeature','MoSold','YrSold','SaleType','SaleCondition']
for feature in labelEncoder_list:
apply_label_encoder(df_test,feature)
def apply_qualify_encoder(data,feature_name):
return data.replace({feature_name:{"None" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5}})
qual_labelEncoder_list = ['ExterCond','ExterQual','HeatingQC','KitchenQual','BsmtQual','BsmtCond','FireplaceQu','GarageCond','GarageQual','PoolQC']
for feature in qual_labelEncoder_list:
df_test = apply_qualify_encoder(df_test,feature)
#special one
df_test = df_test.replace({"BsmtExposure" : {"None" : 0,"No" : 0, "Mn" : 1, "Av": 2, "Gd" : 3},
"BsmtFinType1" : {"None" : 0, "Unf" : 1, "LwQ": 2, "Rec" : 3, "BLQ" : 4, "ALQ" : 5, "GLQ" : 6},
"BsmtFinType2" : {"None" : 0, "Unf" : 1, "LwQ": 2, "Rec" : 3, "BLQ" : 4, "ALQ" : 5, "GLQ" : 6},
"Functional" : {"Sal" : 1, "Sev" : 2, "Maj2" : 3, "Maj1" : 4, "Mod": 5,
"Min2" : 6, "Min1" : 7, "Typ" : 8},
"LandSlope" : {"Sev" : 1, "Mod" : 2, "Gtl" : 3},
"LotShape" : {"IR3" : 1, "IR2" : 2, "IR1" : 3, "Reg" : 4},
"PavedDrive" : {"N" : 0, "P" : 1, "Y" : 2},
"Utilities" : {"ELO" : 1, "NoSeWa" : 2, "NoSewr" : 3, "AllPub" : 4},
"GarageFinish": {"None" : 0, "Unf":1, "RFn":2,"Fin":3},
"GarageType" : {"None":0, "Detchd":1, "CarPort":2, "BuiltIn":3, "Basment":4, "Attchd":5,"2Types":6}})
In [25]:
# apply same rule like the training data
def normalize2(feature):
return (df_test[feature]-min(df_test[feature]))/max(df_train[feature])
group2_1 = normalize2('MSSubClass')+normalize2('BldgType')+normalize2('HouseStyle')
group2_1 = group2_1/3
group2_2 = normalize2('MSZoning')
group2_3 = normalize2('Street')+normalize2('Alley')
group2_3 = group2_3/2
group2_4 = normalize2('LotShape')+normalize2('LandContour')+normalize2('LotConfig')+normalize2('LandSlope')
group2_4 = group2_4/4
group2_5 = normalize2('Utilities')+normalize2('SaleType')+normalize2('SaleCondition')
group2_5 = group2_5/3
group2_6 = normalize2('Neighborhood')
group2_7 = normalize2('Condition1')+normalize2('Condition2')+normalize2('PavedDrive')
group2_7 = group2_7/3
group2_8 = normalize2('OverallQual')+normalize2('OverallCond')
group2_8 = group2_8/2
group2_9 = normalize2('YearBuilt')+normalize2('YearRemodAdd')+normalize2('Functional')
group2_9 = group2_9/3
group2_10 = normalize2('RoofStyle')
group2_11 = normalize2('Exterior1st')+normalize2('Exterior2nd')
group2_11 = group2_11/2
group2_12 = normalize2('MasVnrType')
group2_13 = normalize2('ExterQual')+normalize2('ExterCond')
group2_13 = group2_13/2
group2_14 = normalize2('Foundation')
group2_15 = normalize2('BsmtQual')+normalize2('BsmtCond')+normalize2('BsmtExposure')+normalize2('BsmtFinType1')+normalize2('BsmtFinType2')
group2_15 = group2_15/5
group2_16 = normalize2('Heating')+normalize2('HeatingQC')+normalize2('CentralAir')+normalize2('Electrical')
group2_16 = group2_16/4
group2_17 = normalize2('LowQualFinSF')
group2_18 = normalize2('BsmtFullBath')+normalize2('BsmtHalfBath')+normalize2('FullBath')+normalize2('HalfBath')+normalize2('BedroomAbvGr')
group2_18 = group2_18/5
group2_19 = normalize2('Fireplaces')+normalize2('FireplaceQu')
group2_19 = group2_19/2
group2_20 = normalize2('GarageType')+normalize2('GarageYrBlt')+normalize2('GarageFinish')+normalize2('GarageCars')+normalize2('GarageQual')+normalize2('GarageCond')
group2_20 = group2_20/6
group2_21 = normalize2('PoolQC')+normalize2('Fence')+normalize2('MiscFeature')
group2_21 = group2_21/3
group2_22 = normalize2('MoSold')+normalize2('YrSold')+normalize2('SaleType')+normalize2('SaleCondition')
group2_22 = group2_22/4
In [26]:
X_Cat_test = pd.concat([group2_1,group2_2,group2_3,group2_4,group2_5,group2_6,group2_7,group2_8,group2_9,group2_10,group2_11,group2_12
,group2_13,group2_14,group2_15,group2_16,group2_17,group2_18,group2_19,group2_20,group2_21,group2_22],axis=1)
X_Cat_test.columns = ['group_1','group_2','group_3','group_4','group_5','group_6','group_7','group_8','group_9','group_10','group_11','group_12'
,'group_13','group_14','group_15','group_16','group_17','group_18','group_19','group_20','group_21','group_22']
In [27]:
# Log tranformation
df_test['GrLivArea'] = np.log(df_test['GrLivArea'])
In [28]:
X_data = X_Cat_test
y_pred_data = xgb.predict(X_data)
y_pred_data = np.exp(y_pred_data)
pd.DataFrame({'Id': df_test['Id'], 'SalePrice':y_pred_data}).to_csv('result.csv', index =False)
In [ ]: